import os
from pathlib import Path
import json
import linecache
import functools
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from shapely.ops import nearest_points
from shapely.geometry import LineString
# Root directory of the raw SafeGraph Open Census Data files that this
# notebook reads (its data/, metadata/ and geometry/ subdirectories are used below).
DATA_PATH = Path('/data/safegraph/safegraph_open_census_data')
# Directory that receives the per-county extracts. The path is relative, so it
# is resolved against the current working directory at run time.
PREPROCESSED_DATA_PATH = Path('../../../data/preprocessed/safegraph/safegraph_open_census_data')
# Only needs to be run once.
# Extract the rows for one county from every raw census table so that later
# cells can load small per-county CSVs instead of the full tables.
county_fips_code = '45039'  # Fairfield County, South Carolina

# parents=True also creates PREPROCESSED_DATA_PATH itself (same as `mkdir -p`).
county_directory = PREPROCESSED_DATA_PATH / "data/county" / county_fips_code
county_directory.mkdir(parents=True, exist_ok=True)

# The census tables are the non-hidden files whose names contain a digit
# (e.g. cbg_b01.csv); sorted so the processing order is deterministic.
census_data_file_names = sorted(
    entry.name
    for entry in (DATA_PATH / 'data').iterdir()
    if entry.is_file()
    and not entry.name.startswith('.')
    and any(ch in '0123456789' for ch in entry.name)
)

# Work on bytes: only a prefix comparison is needed, so the files never have
# to be decoded and their content is copied through unchanged.
fips_prefix = county_fips_code.encode()
for file_name in census_data_file_names:
    county_file = county_directory / file_name
    print(county_file)
    with open(DATA_PATH / 'data' / file_name, 'rb') as source, \
            open(county_file, 'wb') as target:
        # First line is the CSV header; keep it so pandas can name the columns.
        target.write(source.readline())
        # Keep only the rows whose line starts with the county FIPS code.
        target.writelines(
            line for line in source if line.startswith(fips_prefix)
        )
# Census columns used in this notebook; disabled candidates are kept for reference.
# See: https://www.safegraph.com/blog/beginners-guide-to-census
table_ids = [
    'B01001e1',  # SEX BY AGE: Total: Total population -- (Estimate)
    # 'B00001e1',  # UNWEIGHTED SAMPLE COUNT OF THE POPULATION: Total: Total population -- (Estimate)
    # 'B00001m1',  # UNWEIGHTED SAMPLE COUNT OF THE POPULATION: Total: Total population -- (Margin of Error)
    'B19013e1',  # Median Household Income
    # 'B00002e1',  # UNWEIGHTED SAMPLE HOUSING UNITS: Total: Housing units -- (Estimate)
    # 'B00002m1',  # UNWEIGHTED SAMPLE HOUSING UNITS: Total: Housing units -- (Margin of Error)
    'B25001e1',  # HOUSING UNITS: Total: Housing units -- (Estimate)
    # 'B25001m1',  # HOUSING UNITS: Total: Housing units -- (Margin of Error)
]

# Show the metadata rows that describe the selected columns.
cbg_field_desc = pd.read_csv(DATA_PATH / 'metadata/cbg_field_descriptions.csv')
cbg_field_desc.loc[cbg_field_desc['table_id'].isin(table_ids)]
# Load the per-county extracts that hold the selected columns and join them
# into one table keyed by census block group.
county_files = ['cbg_b00.csv', 'cbg_b01.csv', 'cbg_b19.csv', 'cbg_b25.csv']  # subset of the files in county_directory

# Read the block-group identifier as a string rather than letting pandas
# infer a numeric dtype for it.
dfs = [
    pd.read_csv(county_directory / file_name, dtype={'census_block_group': str})
    for file_name in county_files
]

# Fold the tables left to right with the default (inner) join on the identifier.
merged = functools.reduce(
    lambda left, right: pd.merge(left, right, on=['census_block_group']),
    dfs,
)

# .copy() makes cbg_data an independent frame, so adding a column below
# does not operate on a slice of `merged` (avoids SettingWithCopyWarning).
cbg_data = merged[['census_block_group'] + table_ids].copy()
cbg_data

cbg_data['B01001e1'].sum()  # total population (Series.sum skips missing values)

# People per housing unit. Block groups with zero housing units would yield
# inf (or NaN for 0/0); store NaN for them so that downstream colour scales
# only ever see finite values or missing data.
cbg_data['household_size'] = cbg_data['B01001e1'] / cbg_data['B25001e1']
cbg_data.loc[cbg_data['B25001e1'] == 0, 'household_size'] = float('nan')
cbg_data
# Census Block Groups
def geojson_for_county(state_abbreviation="SC",
                       county_name="Fairfield County",
                       geojson_path=DATA_PATH / "geometry/cbg.geojson",
                       line_start_search=170000,
                       line_end_search=180000):
    """Return the census-block-group GeoJSON restricted to a single county.

    The file is treated as line-oriented: its first 5 lines (header) and its
    last 2 lines (footer) are kept verbatim, and the lines in between are
    assumed to hold one feature each. Features are selected with a plain
    substring match on
    ``"State": "<state_abbreviation>", "County": "<county_name>"``.

    Only lines ``line_start_search <= n < line_end_search`` (1-based) are
    searched. The default window was found by inspection for the default
    county; for another county, move the window up or down until it covers
    the state of interest (the file is reportedly sorted by state).
    TODO: replace the manual window with an explicit binary search if this
    needs to generalize to other states/counties.

    The file is streamed once instead of being loaded into memory, and no
    shell commands are used, so the function also works outside IPython.

    Args:
        state_abbreviation: Value of the "State" property to match.
        county_name: Value of the "County" property to match.
        geojson_path: Path of the block-group GeoJSON file.
        line_start_search: First 1-based line number to search (inclusive).
        line_end_search: Line number at which the search stops (exclusive).

    Returns:
        The result of ``json.loads`` on header + matching features + footer.

    Raises:
        IndexError: If no feature in the search window matches. The type is
            kept from the previous implementation, which failed the same way
            but without an explanatory message.
        json.JSONDecodeError: If the assembled text is not valid JSON.
    """
    from collections import deque

    header_line_count = 5
    footer_line_count = 2
    needle = f'"State": "{state_abbreviation}", "County": "{county_name}"'

    header = []
    footer = deque(maxlen=footer_line_count)  # always holds the last lines seen
    county_cbgs = []
    with open(geojson_path, encoding='utf-8') as geojson_file:
        for line_number, line in enumerate(geojson_file, start=1):
            if line_number <= header_line_count:
                header.append(line)
            footer.append(line)
            if line_start_search <= line_number < line_end_search and needle in line:
                county_cbgs.append(line.strip())
    print(len(county_cbgs))

    if not county_cbgs:
        raise IndexError(
            f'no census block groups found for {county_name}, {state_abbreviation} '
            f'in lines {line_start_search}-{line_end_search} of {geojson_path}; '
            'adjust line_start_search/line_end_search'
        )

    # Every feature line but the last one in the file ends with ",". The last
    # selected feature must not, or the closing footer would not parse.
    if county_cbgs[-1].endswith(','):
        county_cbgs[-1] = county_cbgs[-1][:-1]

    document_lines = (
        [line.rstrip('\n') for line in header]
        + county_cbgs
        + [line.rstrip('\n') for line in footer]
    )
    return json.loads('\n'.join(document_lines))
# Only needs to be run once: load the block-group geometries for the county.
# (geojson['features'][0]['properties'] shows the properties of one feature.)
geojson = geojson_for_county()

m = folium.Map(location=[34.4, -81.1], zoom_start=10.5)

# One choropleth per metric, added in this order:
# (layer name, cbg_data column, colour scheme, legend caption)
choropleth_layers = [
    ('Median Household Income', 'B19013e1', 'Greens', 'Median Household Income ($)'),
    ('Population', 'B01001e1', 'Blues', 'Population (# of people)'),
    ('Household Size', 'household_size', 'Reds', 'Household Size (# of people)'),
]
for layer_name, value_column, palette, legend in choropleth_layers:
    layer = folium.Choropleth(
        geo_data=geojson,
        name=layer_name,
        data=cbg_data,
        # Join cbg_data rows to features through the block-group identifier.
        columns=['census_block_group', value_column],
        key_on='feature.properties.CensusBlockGroup',
        fill_color=palette,
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name=legend,
    )
    layer.add_to(m)

# Layer control lets the viewer toggle the individual choropleths.
folium.LayerControl().add_to(m)
m